* This do-file merges the aggregate (macro) data with the firm-level (micro) data and prepares the analysis dataset.

* Session setup: close any log left open, fix the project root, make random
* numbers and sort tie-breaking reproducible, then open this script's log.
capture log close
set more off
global mainpath "/Users/ekurt/Desktop/MP ASYM/Stata"
set sortseed 473847
set seed 1073741823
log using "${mainpath}/Log/DataPrep/1.Prepare dataset.log", replace


* 1. PREPARE MACRO DATA
* Load the quarterly US aggregates (import excel's own -clear- empties memory),
* build first differences and keep the result in the tempfile `aggregate'.
import excel "${mainpath}/Aggregate Data/US DATA QUARTERLY.xlsx", sheet("Sheet1") firstrow clear

ren Year cyear
ren Quarter cquarter
* Rows without a year carry no usable observation.
drop if cyear ==.

* The differences below subtract the value of the previous ROW, so the data
* must be in chronological order; do not rely on the spreadsheet's row order.
sort cyear cquarter

* Change in loggdp and in GS1 relative to the previous quarter.
gen dloggdp = loggdp-loggdp[_n-1]
gen dGS1 = GS1-GS1[_n-1]
* dGS1 divided by 0.25.
gen dGS1_scaled = dGS1*(1/0.25)

tempfile aggregate
save `aggregate'

* Quarterly monetary shock series. Rows are put in date order and numbered;
* row k is assigned calendar year floor((k-1)*0.25 + 1969) and the quarter
* equal to its position (1, 2, 3, 4) within that year.
* NOTE(review): this assumes the first row is 1969Q1 and that there are no
* gaps in the series - confirm against the raw file.
use "${mainpath}/Raw_data/Wieland_MP/RR_monetary_shock_quarterly", clear

sort date
gen cyear    = floor((_n - 1) * 0.25 + 1969)
gen cquarter = mod(_n - 1, 4) + 1
sort cyear cquarter

* Give the three residual series their shock names.
rename (resid_romer resid_full resid) (RRHybrid RRExtended RROriginal)

drop date

tempfile RRshocks
save `RRshocks'

* Monetary policy shocks, sheet "shocks_q": harmonise the date keys, discard
* rows without a year and store the result in the tempfile `mpEXTRshocks_EK'.
import excel using "${mainpath}/Aggregate Data/mp shocks/mp shocks.xlsx", sheet("shocks_q") firstrow clear

rename (Year Quarter) (cyear cquarter)
keep if cyear != .

tempfile mpEXTRshocks_EK
save `mpEXTRshocks_EK'

* VAR data, sheet "VARq": same treatment, stored in the tempfile `GK_var'.
import excel using "${mainpath}/Aggregate Data/VAR_data2_GK.xlsx", sheet("VARq") firstrow clear

rename (Year Quarter) (cyear cquarter)
keep if cyear != .

tempfile GK_var
save `GK_var'

* 2. PREPARE MICRO DATA
* Load the raw firm-quarter file and build the calendar year, calendar quarter
* and the quarterly date qdate used as the panel time variable.

use "${mainpath}/Raw_data/Comp_QRaw.dta", clear 

* Calendar year = first four characters of datacqtr, quarter = last character.
gen cyear    = substr(datacqtr,1,4) 
gen cquarter = substr(datacqtr, -1, 1) 

* datadate rendered as text (day, full month name, four-digit year).
gen sdate    = string(datadate, "%ddMCY")

*** supplement the cyear with datadate when empty.
gen cyear2    = substr(sdate,-4,4) 
replace cyear = cyear2 if cyear== ""
drop cyear2

*** supplement missing quarters. 
* quar1 and month_prefix are helper strings that stay in the dataset, so they
* are still created exactly as before. They take the month name from a fixed
* character position and therefore assume a two-digit day.
* NOTE(review): the lower-case d code appears to print days 1-9 with a single
* digit, in which case month_prefix loses its first letter - confirm.
gen quar1 		 = substr(sdate,3,14)
gen month_prefix = substr(quar1, 1,strlen(quar1)-4)

* Fallback quarter taken from the date value itself (January-March = 1, ...,
* October-December = 4; missing when datadate is missing). This gives the same
* result as matching month_prefix against the month names, without depending
* on how the date string is laid out.
gen qua_prefix = quarter(datadate)

* Year and quarter strings glued together, built before the quarter fallback
* below is applied.
egen calendar = concat(cyear cquarter) 

destring cyear cquarter calendar, replace
replace  cquarter = qua_prefix if cquarter==.

sort gvkey cyear cquarter

gen qdate = yq(cyear,cquarter)
format qdate %tq

* Fiscal year (first four characters of datafqtr) and fiscal quarter (last
* character), converted to numbers and combined into the quarterly date fdate.
gen fyear    = substr(datafqtr, 1, 4)
gen fquarter = substr(datafqtr, -1, 1)
destring fyear fquarter, replace

sort gvkey fyear fquarter

gen fdate = yq(fyear, fquarter)
format %tq fdate

* Sample restriction: rows with curncdq "USD" and fic "USA" only.
keep if curncdq == "USD" & fic == "USA"

* Numeric firm identifiers: a plain group counter and a labelled encoding of
* gvkey, plus a labelled encoding of the company name.
capture drop gvkeyid
egen gvkeyid = group(gvkey)
encode gvkey, gen(gvkeyn)
encode conm, gen(conmn)

* Two-digit sector = first two characters of sic; industry codes made numeric.
gen sic_sector = substr(sic, 1, 2)
destring sic sic_sector naics, replace

* 3. DEAL WITH "Y" VARIABLES; CUMULATIVE OVER THE FISCAL YEAR.
* For every cumulative variable v, diff_v is the flow of the single quarter:
*   - fiscal quarter 1: the reported value itself;
*   - later quarters:   reported value minus the value of the preceding
*                       fiscal quarter of the same firm / fiscal year / fyr.
* The difference is taken only when the previous row really is the preceding
* quarter (fquarter one lower). If a quarter is absent, or fquarter is
* missing, diff_v stays missing rather than booking a multi-quarter flow to
* one quarter.

global variables_qcum capxy sstky prstkcy dvy dltisy dltry txpdy cdvcy tdcy depcy aqcy 

* The sort order does not depend on v, so sort once before the loop.
sort gvkeyn fyear fyr fquarter 

foreach v of varlist $variables_qcum {  
	by gvkeyn fyear fyr: gen diff_`v' = `v' if fquarter == 1
	* Within a by-group fquarter[_n-1] is missing on the first row, so the
	* equality below is false there and diff_v stays missing.
	by gvkeyn fyear fyr: replace diff_`v' = `v' - `v'[_n-1] ///
		if fquarter != 1 & !missing(fquarter) & fquarter == fquarter[_n-1] + 1
}

* 4. MAKE SURE WE HAVE A PANEL
* Keep rows that report datacqtr, require one row per firm-quarter, declare
* the panel and let tsfill add rows for quarters missing inside a firm's span.

keep if datacqtr != ""
isid gvkeyn qdate, sort
xtset gvkeyn qdate

tsfill

* Running row counter within firm, in quarter order.
bysort gvkeyn (qdate): gen Trend = _n

* Refill cyear cquarter after tsfill 
* date is the daily date that dofq() returns for the quarterly date qdate.
gen date = dofq(qdate)
format %d date

replace cyear    = year(date)
replace cquarter = quarter(date)

format %tq qdate

* Linearly interpolate each quarterly flow diff_v within firm along qdate.
* The result takes the name of v with its last character replaced by "q"
* and the label "<label of v> (v)".
foreach v of varlist $variables_qcum {  

	* Name of v without its final character.
	local stem = substr("`v'", 1, length("`v'")-1)  
	global lab: variable label `v'

	sort gvkeyn qdate
	by gvkeyn: ipolate diff_`v' qdate, gen(`stem'q)
	sort gvkeyn qdate

	label var `stem'q "${lab} (`v')"
} 

* Variables that are interpolated within firm along qdate. The list no
* longer ends in a line continuation: the trailing /// only worked because a
* blank line happened to follow it.
* NOTE(review): the list also contains code-like variables (conmn, exchg,
* dldte, sic, naics); linear interpolation returns a valid code only where
* the value is the same on both sides of a gap - confirm this is intended.
global variables_q txditcq prccq saleq cogsq ugiq xoprq oiadpq xintq /// 
				   piq txtq xrdq  uniamiq cheq chq rectq invtq actq atq apq dlcq ///
				   ppegtq ppentq aoq txpq lctq dlttq ltq pstkq ceqq cstkq ///
				   req  teqq  lseq xiq conmn exchg ///
				   dldte sic naics cshoq mkvaltq  dd1q prchq prclq

* For each v: the reported series is kept as v_original and v itself becomes
* the interpolated series, labelled "<label of v> (v)".
foreach v of varlist $variables_q {  

	sort gvkeyn qdate 
	by gvkeyn: ipolate `v' qdate, gen(ipolated_`v')
	sort gvkeyn qdate
	global lab: variable label `v'
	label var ipolated_`v' "${lab} (`v')"

	* Carry the value label over (e.g. the one encode attached to conmn);
	* without this the interpolated replacement shows bare numbers.
	local vallab : value label `v'
	if "`vallab'" != "" {
		label values ipolated_`v' `vallab'
	}

	ren `v' `v'_original
	ren ipolated_`v' `v'
	
}
	 
* Number of rows per firm.
sort gvkeyn
by gvkeyn: gen f_obssize_t=_N

* 6. MERGE THE MACRO DATA
* Attach the four quarterly macro files by calendar year and quarter, one
* after the other. Firm rows are kept whether or not they find a match; rows
* that exist only in a macro file are discarded and no _merge is left behind.

foreach src in aggregate mpEXTRshocks_EK GK_var RRshocks {
	merge m:1 cyear cquarter using ``src'', keep(master match) nogenerate
}

xtset gvkeyn qdate
save "${mainpath}/Data/US_fund_merged_basicCleaning.dta", replace

* IPO dates from CRSP.dta: rows with an IPO date, reduced to ONE row per
* gvkey (the earliest ipodate) so the file can be merged m:1 below.
use "${mainpath}/Data/CRSP/CRSP.dta", clear

keep if ipodate !=.
gen ipoyear = year(ipodate)
* A blank key identifies no firm; panel rows created by tsfill also have a
* blank gvkey and must not pick up such a row.
drop if missing(gvkey)
bysort gvkey (ipodate): keep if _n == 1

tempfile crsp_ipo
save `crsp_ipo'

* Founding dates from IPO RITTER.dta, keyed by cusip without leading zeros
* and reduced to ONE row per key (smallest foundingdate; missing values sort
* last, so a non-missing date is preferred).
use "${mainpath}/Data/Ipo Ritter/IPO RITTER.dta", clear

generate cusipnew = ustrregexrf(cusip,"^0+","")
* NOTE(review): negative founding dates are presumably missing-value codes.
drop if foundingdate < 0
drop if missing(cusipnew)
bysort cusipnew (foundingdate): keep if _n == 1

tempfile ritter_ipo
save `ritter_ipo'

use "${mainpath}/Data/US_fund_merged_basicCleaning.dta", clear

* Same key construction as in the founding-date file.
generate cusipnew = ustrregexrf(cusip,"^0+","")

* m:1 instead of m:m: an m:m merge pairs rows by position within the key and
* can add duplicate firm-quarter rows when the using file has more rows for a
* key than the panel. With one using row per key, every firm-quarter gets the
* same founding / IPO information and the number of panel rows cannot change.
merge m:1 cusipnew using `ritter_ipo'
drop if _merge ==2
rename _merge _merge1

merge m:1 gvkey using `crsp_ipo'
drop if _merge ==2
rename _merge _merge2

drop crsppermanentid

* Starting year per row: the smaller of ipoyear and foundingdate (min()
* skips a missing argument), then averaged within firm so every row of the
* firm carries the same start_full.
bysort gvkey: gen start = min(ipoyear,foundingdate)

bysort gvkeyn: egen start_full = mean(start)

* Diagnostic only: count the distinct gvkey values that have a starting
* year. The data in memory are put back unchanged afterwards.
preserve
keep if start_full !=.
capture drop firmtag
egen firmtag = tag(gvkey) 
count if firmtag 
restore

* Age = calendar year minus starting year.
generate age = cyear - start_full
summarize age, detail 

* nageflag marks rows with a negative age; sumnageflag is the number of such
* rows within the firm.
capture drop nageflag sumnageflag
by gvkeyn: gen nageflag = 1 if age < 0
by gvkeyn: egen sumnageflag = sum(nageflag)

save "${mainpath}/Data/US_fund_merged_basicCleaning_age.dta", replace

capture log close
